suppressPackageStartupMessages({
library(Seurat)
library(tidyverse)
library(SingleCellExperiment)
library(scater)
library(pbapply)
library(future)
library(reshape2)
library(ggpubr)
library(UpSetR)
})
source("~/multiOmic_benchmark/selectFeatures.R")
Load F74 datasets (preprocessed as in makeSCElist.r). Includes raw and processed ATAC-seq and RNA-seq dataset. ATAC-seq data was reduced to gene level features as in Seurat pipeline.
f74.sce.list <- readRDS("~/my_data/integrated_thymus/F74_SCElist_20191017.RDS")
f74.atac <- f74.sce.list$ATAC
f74.rna <- f74.sce.list$RNA
Compare betwen HVGs from ATAC and HVGs from RNA (i.e. genes used for cell type annotation)
n_features = 4000
hvg.rna <- HVG_Seurat(f74.sce.list[["RNA"]], nfeatures = n_features) %>% {.[. %in% rownames(f74.sce.list[["ATAC"]])]}
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
hvg.atac <- HVG_Seurat(f74.atac, nfeatures = length(hvg.rna))
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
upset(fromList(list(hvg.rna=hvg.rna, hvg.atac=hvg.atac)))
f74.atac.seu <- as.Seurat(f74.atac, verbose=FALSE)
f74.atac.seu <- ScaleData(f74.atac.seu, do.center = TRUE, verbose=FALSE)
f74.atac.seu <- RunPCA(f74.atac.seu, features = hvg.rna, verbose=FALSE)
f74.atac.seu <- RunUMAP(f74.atac.seu, reduction = "pca", dims=1:30, verbose=FALSE)
DimPlot(f74.atac.seu, reduction = "umap", group.by = NULL) + ggtitle("scRNA-seq HVGs")
Provided by Cecilia
Genes that are distinctive of the same clusters will tend to be co-expressed in the same cells. Hence, to test if marker genes show patterning also in accessibility, I calculate correlation of accessibility profiles of thymus development marker genes and compare co-accessibility with their correlation in gene expression.
markers <- c('PTPRC','CD4','CD8A','CD8B','CD79A','FOXN1','EPCAM','PDGFRA','GNG4',
'FOXP3','RAG1','RAG2','PTCRA','IL7R','NKG7','CCR7')
markers <- intersect(markers, rownames(f74.atac.seu@assays$RNA@counts))
## Calculate co-accessibility of marker genes
marker.cormat.atac <-
f74.atac.seu@assays$RNA@scale.data[markers,] %>%
as.matrix() %>%
t() %>%
cor(use="pairwise.complete.obs")
## Same on RNA
f74.rna.seu <- as.Seurat(f74.rna)
f74.rna.seu <- ScaleData(f74.rna.seu)
Centering and scaling data matrix
|
| | 0%
|
|== | 3%
|
|===== | 6%
|
|======= | 9%
|
|========= | 12%
|
|============ | 15%
|
|============== | 18%
|
|================ | 21%
|
|=================== | 24%
|
|===================== | 26%
|
|======================== | 29%
|
|========================== | 32%
|
|============================ | 35%
|
|=============================== | 38%
|
|================================= | 41%
|
|=================================== | 44%
|
|====================================== | 47%
|
|======================================== | 50%
|
|========================================== | 53%
|
|============================================= | 56%
|
|=============================================== | 59%
|
|================================================= | 62%
|
|==================================================== | 65%
|
|====================================================== | 68%
|
|======================================================== | 71%
|
|=========================================================== | 74%
|
|============================================================= | 76%
|
|================================================================ | 79%
|
|================================================================== | 82%
|
|==================================================================== | 85%
|
|======================================================================= | 88%
|
|========================================================================= | 91%
|
|=========================================================================== | 94%
|
|============================================================================== | 97%
|
|================================================================================| 100%
marker.cormat.rna <-
f74.rna.seu@assays$RNA@scale.data[markers,] %>%
as.matrix() %>%
t() %>%
cor(use="pairwise.complete.obs")
We can see that the co-accessibility signal is much weaker than the co-expression signal, but these are significantly correlated.
png("~/multiOmic_benchmark/output/ATAC_EDA/ATAC_hvg_coaccessibility.png", height=500, width=500)
pheatmap::pheatmap(marker.cormat.atac, main="co-accessibility", cellwidth = 20, cellheight = 20)
dev.off()
null device
1
Control w random genes
PDGFRA is not in the ATAC gene activity matrix (not covered enough?), so I check for genes with high co-expression with PDGFRA in RNA.
fb_markers <- c("PDGFRA")
pdgfra.x <- f74.rna.seu@assays$RNA@scale.data["PDGFRA",]
pdgfra.cor <- apply(f74.rna.seu@assays$RNA@scale.data[hvg.rna,], 1, function(x) cor(x=x, y=pdgfra.x, use="complete.obs"))
fb_markers_atac <- pdgfra.cor[which(pdgfra.cor > 0.7)]
FeaturePlot(f74.atac.seu, features = names(fb_markers_atac))
Compare with expression in RNA
f74.rna.seu <- ScaleData(f74.rna.seu, features = hvg.rna)
Centering and scaling data matrix
|
| | 0%
|
|======================================== | 33%
|
|================================================================================ | 67%
|
|========================================================================================================================| 100%
f74.rna.seu <- RunPCA(f74.rna.seu, features = hvg.rna)
PC_ 1
Positive: TRBC2, LTB, CORO1A, CD1A, SMPD3, NPPC, STMN1, CHI3L2, HMGA1, LCP1
RGCC, LAPTM5, TRAC, IL7R, TRAV13-2, MCTP1, HIST1H1C, ITM2A, HIST1H2BJ, CD82
IL32, MT1X, RP3-395M20.12, SH2D2A, TRAV13-1, SMIM24, CDKN2C, TRAV8-4, PTMA, LINC01550
Negative: CALD1, COL5A2, COL6A1, SPARC, NFIB, TSHZ2, COL3A1, NID1, CPE, PLAC9
EFEMP2, MAP1B, FLRT2, FSTL1, LAMB1, COL5A1, AHNAK, MMP2, BGN, LUM
AKAP12, OLFML3, COL1A1, CXCL12, IGFBP4, LAMA4, MMP14, LAMC1, COL12A1, PXDN
PC_ 2
Positive: LTB, OLFML3, COL5A2, MMP2, SFRP1, MXRA8, COL12A1, PLAC9, PLAT, TSHZ2
CALD1, CPE, NTRK2, SMOC2, TSC22D3, LUM, NID1, BOC, COL5A1, NUPR1
OSR1, MDFI, DKK3, COL6A1, NFIB, LAMB1, CERCAM, LAMA4, PRRX1, LRP1
Negative: UBE2T, TYMS, CENPM, RRM2, CENPW, ASF1B, MAD2L1, CENPU, CCNA2, CDK1
DTYMK, RRM1, UHRF1, DHFR, LMNB1, MKI67, CDT1, HMGB3, MND1, NUSAP1
ZWINT, TOP2A, KIFC1, TACC3, KIF22, CDCA5, GMNN, CENPF, NCAPG, NCAPH
PC_ 3
Positive: HLA-DRA, HLA-DRB5, HLA-DRB1, TYROBP, HLA-DPA1, HLA-DQA1, CD74, HLA-DQB1, SAMHD1, C1QC
RNASE1, HCK, HLA-DMB, SPI1, HLA-B, STAB1, CSF1R, CD36, CTSH, FOLR2
TMEM176B, GIMAP4, FCER1G, IRF8, CSF2RA, CD86, PLVAP, MMP9, PLEK, HLA-DMA
Negative: NTRK2, SFRP1, PLAT, SCARA5, GPC3, CREB3L1, SFRP2, PTPRD, OSR1, DLK1
TMEFF2, OLFML3, C1QTNF2, CERCAM, GDF10, CDO1, DKK3, KAZALD1, EBF2, MXRA8
SMOC2, HTRA3, SGCD, THBS2, COL16A1, MMP2, CLMP, HAND2-AS1, KDELR3, COL12A1
PC_ 4
Positive: APLNR, CDH5, COL4A1, COL15A1, CAV1, CRIP2, CXCL13, COL4A2, MADCAM1, KDR
COX4I2, CLDN5, PAPLN, MYLK, NTS, PODXL, SPNS2, RGS5, ESAM, BCAM
CCL17, HPN, CXorf36, CYP1B1, S100A16, SOX18, TMEM88, TIE1, KCNJ8, CCL2
Negative: CSF1R, PLD4, HCK, ADAP2, CD86, FOLR2, MARCH1, MRC1, SPI1, HLA-DMB
CD14, F13A1, SIGLEC1, TGFBI, CEBPA, LY86, IL6R, SLC15A3, CSF2RA, C3AR1
FCGR2A, AGR2, FGD2, BLVRB, PTAFR, SLAMF8, CXCL16, IRF5, MMP9, HRH1
PC_ 5
Positive: DMTN, ANK1, RHAG, KLF1, TMOD1, TMEM56, HBZ, C17orf99, GMPR, PHOSPHO1
HBQ1, CR1L, HBM, TRIM58, RHCE, EPB42, SPTA1, TAL1, OSBP2, FAXDC2
SOX6, MYL4, SMIM1, MFSD2B, TUBB1, SELENBP1, TFR2, GFI1B, SPTB, CTSE
Negative: TMSB4X, PTMA, NPM1, ACTB, CORO1A, CYBA, LSP1, RPL35, ID3, CNN2
ARPC2, PPA1, C1QBP, RPS17, RPL22L1, SEPW1, IFITM2, STMN1, ISG15, COX5A
CTSC, HSPD1, PKM, LTB, GPI, HSP90AB1, ALDOA, PRMT1, CCND2, LY6E
f74.rna.seu <- RunUMAP(f74.rna.seu, reduction = "pca", dims=1:30)
07:46:17 UMAP embedding parameters a = 0.9922 b = 1.112
07:46:17 Read 8321 rows and found 30 numeric columns
07:46:17 Using Annoy for neighbor search, n_neighbors = 30
07:46:17 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
07:46:19 Writing NN index file to temp file /tmp/RtmpAUvvXC/file52fa48b1d76e
07:46:19 Searching Annoy index using 1 thread, search_k = 3000
07:46:22 Annoy recall = 100%
07:46:22 Commencing smooth kNN distance calibration using 1 thread
07:46:24 Initializing from normalized Laplacian + noise
07:46:24 Commencing optimization for 500 epochs, with 353528 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
07:46:45 Optimization finished
First clusters only
full_join(clusters_df, umap_df) %>%
write_csv("~/my_data/integrated_thymus/F74_ATAC_clustersUMAP_20191021.csv")
Joining, by = "cell"